* This script assembles the USSC data (USSCdata.dta) and TRAC data (TRACdata.dta) used in Spamann's comment on Cho et al. from raw USSC and TRAC data, respectively
* The raw USSC and TRAC data are available at
*** USSC: http://dx.doi.org/10.7910/DVN/TZRNKD (Dataverse): files opafy92nid.dta-opafy03nid.dta -- make sure to download the original .dta file format, not .tab etc.
*** TRAC: email trac@syr.edu, who will make the data available to researchers from subscriber institutions (for reference purposes, I obtained the data as a download from http://trac.syr.edu/projects/judges/0416/extract.zip)

* TO RUN THIS SCRIPT
* 1. create a directory X where you want to create the data
* 2. create folders "USSC" and "TRAC" inside that directory X and put the raw USSC and TRAC data, respectively, into those two folders (script assumes TRAC data are a zipped csv file, USSC data unzipped dta files)
* 3. point the cd command in line 12 below to your directory X

set more off // do not pause the output with --more-- prompts while the script runs
cd [REPLACE THESE SQUARE BRACKETS WITH YOUR DIRECTORY PATH TO X]
* all relative paths used below (USSC/, TRAC/, extract.csv, USSCdata.dta, TRACdata.dta) are resolved against this working directory

***************************************************************
**** 0.1 Calendar file

* generate a file marking beginning of DST and Mondays before and after
* Build a daily calendar (one row per date) that flags the Monday right after
* DST begins, as well as the Mondays exactly one week before and one week after.
* The result is stored in the tempfile `calendar' and merged onto both data sets below.
tempfile calendar
clear
set obs 11500 // 11,500 consecutive days: 01jan1987 (Stata date 9862) through 26jun2018 (Stata date 21361)
gen sentdate = 9861 + _n // first row is Jan 1, 1987; 1987 is the year when the start date of DST changed to what it was in the Cho sample
format sentdate %td
gen month = month(sentdate)
gen dow = dow(sentdate) // 0 = Sunday, 1 = Monday, ..., 6 = Saturday
gen sentyr = year(sentdate)

* DSTbegin = 1 on the Monday that falls on April 2-8 (years before 2007) or on
* March 9-15 (2007 onward), i.e. the day after the first Sunday in April or the
* second Sunday in March, respectively; 0 on every other day
gen DSTbegin = 0
replace DSTbegin = 1 if dow==1 & sentyr< 2007 & month==4 & inrange(day(sentdate), 2, 8)
replace DSTbegin = 1 if dow==1 & sentyr>=2007 & month==3 & inrange(day(sentdate), 9, 15)

* rows are consecutive days, so an offset of 7 rows is exactly one week
gen post_DSTbegin = (DSTbegin[_n-7] == 1) // Monday one week after the DST Monday
gen pre_DSTbegin  = (DSTbegin[_n+7] == 1) // Monday one week before the DST Monday
compress
save `calendar', replace

******************************************************************
**** 0.2. USSC

* ingesting and collating original USSC files
* Stack the twelve annual USSC files (fiscal years 1992-2003) into one data set,
* keeping only the variables listed below and tagging each batch of appended
* rows with its fiscal year in ussc_fy. Variable names are lower-cased at the end.
clear
local USSCvariables "AGE CIRCDIST HISPORIG MONRACE MONSEX NEWCIT NEWCNVTN NEWRACE NEWEDUC NOCOUNTS NUMDEPEN OFFTYPE2 PROBATN SENSPLT SENSPLT0 SENTDATE SENTTOT SENTTOT0 STATMIN USSCIDN XCRHISSR XFOLSOR"
forvalues fy = 1992/2003 {
	local yr = substr("`fy'", 3, 2) // two-digit suffix used in the file names: 92, ..., 99, 00, ..., 03
	disp "now appending `yr'"
	* OFFCASE and PETTYBC are requested only from the 1992-1998 files
	local extra ""
	if `fy' <= 1998 local extra "OFFCASE PETTYBC"
	append using USSC/opafy`yr'nid.dta, keep(`USSCvariables' `extra')
	* freshly appended rows have ussc_fy missing; fill them with the current fiscal year
	if `fy' == 1992 gen ussc_fy = 1992
	else replace ussc_fy = `fy' if ussc_fy == .
	}
rename *, lower

* Sanity filters and de-duplication on the USSC ID.
* `duplicates drop usscidn, force' keeps the first row of each usscidn group in
* the current sort order, so the sort below decides which record survives
* (the one from the earliest fiscal year). The stable option makes any ties on
* (usscidn, ussc_fy) keep their existing relative order instead of being
* broken at random, so the retained record is reproducible across runs.
drop if mi(sentdate) // 0 drops. Sanity check only -- USSC data until 2003 should all have dates
duplicates drop  // 0 duplicates. Sanity check only -- the USSC ID is in the data, so this would concern only genuine duplicates (errors)
sort usscidn ussc_fy, stable
duplicates drop usscidn, force // 56 observations appear in different fiscal years

* renaming, recoding, and cleaning variables
* Rename the USSC variables used as district, trial indicator and age, then
* derive recoded copies (via gen()) of sex, criminal history, offense level,
* race, Hispanic origin, number of counts and dependents. The source variables
* are left in the data except where replaced explicitly (nocounts).
rename (circdist newcnvtn age) (distr trial offage)
label var trial "plea (0) or trial (1) (NEWCNVTN from USSC)"
label var distr "CIRCDIST from USSC"
label var offage "AGE from USSC"
gen offage2 = offage^2
recode monsex (0 = 1) (1 = 0) (9 = .), gen(gender) // 9 is "missing", flipping 0 and 1 corresponds to Cho et al. coding (according to match results)
recode xcrhissr (8 9 = .), gen(crimhist) // 9 is "missing",  8 is "N/A or conviction solely under 18 U.S.C. 924(c)" in 1991-98; 8/9 don't exist after 1998
recode xfolsor (93/99 = .), gen(offensl) // 99 is a missing code, and 93-95 special codes, for xfolsor in 1991-98; values not around in later years
recode newrace (1 = 0) (2 = 1) (else = .), gen(race) // Cho et al. only use black/white ...
recode monrace (1=0) (2=1) (nonmiss = 2), gen(mrace)  // ... but in so doing also drop all Hispanic blacks & whites, and others. mrace preserves both (0 = monrace 1, 1 = monrace 2, 2 = any other nonmissing monrace) ...
recode mrace (2 = .), gen(mrace_bw) // ... whereas mrace_bw keeps only mrace 0/1 (i.e., keeps all black or white, including Hispanics, but not Asian etc.)
recode hisporig (1 = 0) (2 = 1) (else = .), gen(hispanic)
* nocounts is edited in place: 99 is set to missing for fiscal years up to 1998 (presumably a missing-value code in those years -- verify against the codebook)
replace nocounts = . if nocounts==99 & ussc_fy<=1998
* noconvic: 0 if exactly one count, 1 for any other nonmissing number of counts
recode nocounts (1 = 0) (nonmissing = 1), gen(noconvic)
recode numdepen (97 = 1), gen(dependents) // 97 means at least one dependent, exact number unknown
recode dependents (99 = .) if ussc_fy<=1998 // 99 is missing value code in 1992-98

* Immigration-case indicator built from offtype2: 27 -> 1, 99 -> missing, every other nonmissing value -> 0.
* The tab is informational only (prints the offcase breakdown of offtype2==27 rows; offcase exists only for the 1992-98 files).
tab offcase if offtype2==27 // most of immigration cases 1992-98 (29k total) are illegal entry (121021 - 18k), or fraudulently acquiring entry docs (122021 - 3k) or passport (122041 - 300)
recode offtype2 (27 = 1) (99 = .) (nonmissing = 0), gen(immigrationcase) // I leave this in the data to investigate the role of immigration cases, if any (none in my analysis)

* Capped copies of the sentence-length variables (senttot, senttot0, statmin):
* values from 470 up to the codebook maximum are collapsed to 470, larger values
* are set to missing. The codebook maximum differs by period, hence two recodes
* restricted by fiscal year. The raw variables are left untouched.
gen csenttot = senttot
gen csenttt0 = senttot0
gen cstatmin = statmin
recode csenttot csenttt0 cstatmin (470/990  = 470)	( 991 / max = .) if ussc_fy<=1998 // 470 is life, 990  is supposedly max
recode csenttot csenttt0 cstatmin (470/9997 = 470)	(9998 / max = .) if ussc_fy>=1999 // 470 is life. 9997 is supposedly the maximum value, according to codebook -- but there is one with 11520
* log outcomes: ln() returns missing for 0, so logsent is missing for zero sentences, whereas logsent1 = ln(1 + x) is 0 there
gen logsent = ln(csenttot)
gen logsent1 = ln(1+csenttt0)
* prison: 0 if csenttt0 is 0, 1 for any other nonmissing value, missing otherwise
recode csenttt0 (0 = 0) (nonmissing = 1) (missing = .), gen(prison)

* creating some dummies (because Stata's mixed (HLM) command can't handle temporary variables)
* fvrevar with stub() materialises each factor-variable level as a permanent
* variable named <stub>1, <stub>2, ... (one per level, in level order).
* The variables dropped afterwards are the ones treated as omitted categories.
fvrevar i.crimhist, stub(crmhst)
fvrevar b6.neweduc, stub(edu) // b6. declares neweduc==6 as the base level
fvrevar i.newrace, stub(newrace)
fvrevar i.mrace, stub(mrace)
* NOTE(review): which neweduc level edu4 corresponds to depends on the levels present in the data -- confirm before changing
drop crmhst1 edu4 newrace1 mrace1 //  NB: the text of Cho et al. would suggest that edu1 (neweduc==1) is omitted category, but merge suggests otherwise
rename crmhst# crmhst#, renumber // go from 2-6 to 1-5, as in Cho et al.'s data

* adding "sleepy Monday" etc. date identifiers
* attach the calendar flags; every sentencing date must be found in the calendar
* (no master-only rows allowed), and calendar days without a sentence are discarded
merge m:1 sentdate using `calendar', assert(using match) keep(match) nogenerate

* districts flagged as controls, identified by their CIRCDIST code
* (presumably the districts that did not switch to DST -- verify the codes against the USSC codebook)
gen controlstate = (distr==49 | distr==50 | distr==64 | distr==70)
* sleepy = DST Monday in a district that is not a control
gen sleepy = (DSTbegin != 0) & (controlstate == 0)

* declare the panel variable, shrink storage types, and write the final USSC file
xtset distr
compress
save USSCdata.dta, replace


***************************************************************
**** 0.3 TRAC

* ingest original TRAC file
* unpack the TRAC archive into the working directory and read the csv it contains
unzipfile TRAC/extract.zip, replace
import delimited extract.csv, varnames(1) clear

* Build a Stata date from sentencingdate. Values that are 3-5 characters long
* after conversion to string are left-padded with zeros to 6 characters
* (presumably YYMMDD values that lost their leading zeros on import), then
* parsed as year-month-day with two-digit years resolved to no later than 2016.
drop if missing(sentencingdate)
tostring sentencingdate, replace
replace sentencingdate = substr("000", 1, 6 - strlen(sentencingdate)) + sentencingdate if inrange(strlen(sentencingdate), 3, 5)
gen sentdate = date(sentencingdate, "YMD", 2016)

* clean decision types: TRAC also contains magistrate judges, resentencing decisions, etc., so need to filter
* Restrict to district-court guilty dispositions reached by trial or plea, then
* keep one record per defendant and case.
keep if courttype=="DC" // district court, not magistrate (eliminates 630k obs.)
keep if dispositiontype=="GT" // "GT" is "guilty" -- all but 21k obs. If not guilty, then there can be (should be) no sentence (except possibly nolo contendere = ~300 cases)
* trial = 1 for trial dispositions, 0 for pleas, missing (and dropped) for anything else
gen trial = 1 if inlist(dispositionreason, "BTRD", "BTRM", "JTRD", "JTRM") // trial types
recode trial (. = 0) if inlist(dispositionreason, "PLED", "PLEM", "PLOD", "PLOM", "PUCP", "PUIF", "PUIN") // pleas
drop if mi(trial) // this concerns only 12k obs. Presumably all real initial sentencing decisions should happen after plea or trial
* `duplicates drop ..., force' keeps the first row of each group in the current
* order, so the preceding sort decides which record survives. The sorts are
* stable so that rows tied on the sort key keep their existing relative order;
* without it Stata breaks ties at random and the surviving record (and hence
* its trial/sentence/judge values) could differ from run to run.
sort fiscalyear, stable
duplicates drop district caseid participantid sentdate, force // if a record is duplicated later, I retain the earlier one. 18k obs. dropped
sort sentdate, stable
duplicates drop district caseid participantid, force // if there are multiple decisions for the same defendant and case, I retain only the first to omit resentencings. 7k obs dropped

* clean and rename variables (judge and district: see final step)
* NOTE(review): gen() must name a variable that does not exist yet, so this line only runs if
* "leadcharge" is an abbreviation of a longer raw variable name -- confirm against the csv header
encode leadcharge, gen(leadcharge)

* Prison sentence: lengthofprisonsentence is a string. Entries containing "E" are blanked
* (presumably numbers mangled into scientific notation -- verify), "LIF" is flagged as life,
* and the characters M, L, I, F are stripped before conversion to numeric.
replace lengthofprisonsentence="" if regexm( lengthofprisonsentence,"E")
gen life=lengthofprisonsentence=="LIF"
destring lengthofprisonsentence, gen(sentence) ignore("M" "LIF")
replace sentence = . if sentence<0 // one observation -6
* cprison: sentence capped at 470, with life sentences also set to 470
recode sentence (470 / max = 470), gen(cprison)
replace cprison = 470 if life==1
gen logprison = ln(cprison) // ln(0) is missing, so zero sentences get a missing logprison
* prison: 0 if cprison is 0, 1 for any other nonmissing value, missing otherwise
recode cprison (0 = 0) (nonmissing = 1) (missing = .), gen(prison)

* keep only the identifiers and the variables built above; everything else from the raw file is discarded
keep caseid participantid sentdate districtcode judgeid trial sentence prison cprison logprison life leadcharge

* adding "sleepy Monday" etc. date identifiers
* attach the calendar flags; rows whose date is not in the calendar (before 1987) are discarded
merge m:1 sentdate using `calendar', keep(3) nogenerate // TRAC has some very old entries
* Control districts: Arizona and Hawaii throughout, and the two Indiana districts
* only through 2005. Indiana began observing DST statewide on 2 April 2006, so
* on the DST Monday of 2006 (3 April) its districts are treated, not controls.
gen controlstate = inlist(districtcode,"AZ","HI") | (inlist(districtcode,"INN","INS") & sentyr<2006)
* sleepy = DST Monday in a district that is not a control
gen sleepy = DSTbegin & !controlstate
gen outofsample = sentdate>=td(1oct2003) & !mi(sentdate) // USSC fiscal year starts on October 1

* final clean, compress, and save
* replace the string district and judge identifiers by labelled numeric versions
encode districtcode, generate(district)
encode judgeid, generate(judge) // note that 75k cases have an unknown judge, as can be ascertained running: count if judge=="UN":judge
drop districtcode
drop judgeid

* declare the panel variable, shrink storage types, and write the final TRAC file
xtset district
compress
save TRACdata.dta, replace
